In [1]:
# Shared notebook setup.  NOTE(review): the wildcard import supplies the names
# used below (pd, os, DATA_PATH, display, HTML) -- prefer explicit imports so
# readers can see where each name comes from.
from setup_notebooks import *
%matplotlib inline

In [2]:
# Notebook display configuration.  The %matplotlib inline magic is already
# issued in the setup cell above, so it is not repeated here.
display(HTML("<style>.container { width:100% !important; }</style>"))  # full-width cells
pd.set_option('display.max_rows', 12)      # keep DataFrame reprs short
pd.set_option('display.max_columns', 200)  # but show wide frames in full



In [29]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
from collections import OrderedDict

Load previously cleaned data


In [3]:
# Reload the previously cleaned data (DATA_PATH comes from setup_notebooks).
# NOTE(review): engine='python' is used for the first two reads but not the
# third -- confirm whether it is still required for these gzip files.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'))
df.tokens


Out[3]:
0         ['python', 'never', 'stop', 'learning', 'what'...
1                       ['Watching', 'Boa', 'vs', 'Python']
2         ['Monty', 'Python', 'The', 'silly', 'walk', 'v...
3         ['Senior', 'Software', 'Engineer', 'Full', 'St...
4         ['Architect', 'Django', 'Solr', 'Platform', 'E...
5           ['peaceful', 'rain', 'Python', 'inevitability']
                                ...                        
183064    ['Las', 'mejores', 'ides', 'para', 'Python', '...
183065    ['Gagal', 'tidur', 'gegara', 'habis', 'vertica...
183066         ['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python']
183067    ['RT', 'RealPython', 'List', 'of', 'Python', '...
183068                  ['Watching', 'Boa', 'vs', 'Python']
183069    ['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '...
Name: tokens, dtype: object

In [9]:
d = Dictionary.from_documents(([str(s) for s in row]for row in df.tokens))

In [4]:
df.tokens.iloc[0]


Out[4]:
"['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing']"

In [ ]:
# One way to fix this: parse the stringified lists back into real lists.
# ast.literal_eval only evaluates Python literal syntax; eval() would execute
# arbitrary code embedded in the data, which is unsafe for scraped tweets.
from ast import literal_eval
df['tokens'] = df['tokens'].apply(literal_eval)

When we said "QUOTE_NONNUMERIC" we didn't mean ALL nonnumeric fields ;)

So we can recreate the token lists using split() again.


In [13]:
# Simpler fix: rebuild the token lists straight from the raw text column.
# A plain whitespace split recovers real Python lists of strings.
df['tokens'] = df['txt'].str.split()
df['tokens']


Out[13]:
0         [python, never, stop, learning, what, you, enj...
1                               [Watching, Boa, vs, Python]
2           [Monty, Python, The, silly, walk, via, YouTube]
3         [Senior, Software, Engineer, Full, Stack, Pyth...
4         [Architect, Django, Solr, Platform, Engineer, ...
5                   [peaceful, rain, Python, inevitability]
                                ...                        
183064    [Las, mejores, ides, para, Python, Antes, de, ...
183065    [Gagal, tidur, gegara, habis, vertical, limit,...
183066                     [Go, boa, wkwk, Boa, vs, Python]
183067    [RT, RealPython, List, of, Python, API, Wrappe...
183068                          [Watching, Boa, vs, Python]
183069    [Чертова, дюжина, вакансий, в, IT, и, Digital,...
Name: tokens, dtype: object

That's more like it, our tokens are now lists of strings not stringified lists of strings ;)


In [16]:
df.tokens.values[0:3]


Out[16]:
array([['python', 'never', 'stop', 'learning', 'what', 'you', 'enjoy', 'doing'],
       ['Watching', 'Boa', 'vs', 'Python'],
       ['Monty', 'Python', 'The', 'silly', 'walk', 'via', 'YouTube']], dtype=object)

In [17]:
# Build the gensim Dictionary (token <-> integer id mapping, plus document
# frequencies) from the now-correct lists of token strings.
d = Dictionary.from_documents(df.tokens)
d


Out[17]:
<gensim.corpora.dictionary.Dictionary at 0x7f523743de80>

In [18]:
tfidf = TfidfModel(d)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-18-af38a11fda4c> in <module>()
----> 1 tfidf = TfidfModel(d)

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    116             if docno % 10000 == 0:
    117                 logger.info("PROGRESS: processing document #%i" % docno)
--> 118             numnnz += len(bow)
    119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1

TypeError: object of type 'int' has no len()

Hint-Hint: gensim is sprinting this week at PyCon!


In [19]:
TfidfModel?

In [20]:
TfidfModel(df.txt)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-d3b6002038b1> in <module>()
----> 1 TfidfModel(df.txt)

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    117                 logger.info("PROGRESS: processing document #%i" % docno)
    118             numnnz += len(bow)
--> 119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1
    121 

ValueError: not enough values to unpack (expected 2, got 1)

In [21]:
TfidfModel(df.tokens)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-ddd5365ea7ab> in <module>()
----> 1 TfidfModel(df.tokens)

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    117                 logger.info("PROGRESS: processing document #%i" % docno)
    118             numnnz += len(bow)
--> 119             for termid, _ in bow:
    120                 dfs[termid] = dfs.get(termid, 0) + 1
    121 

ValueError: too many values to unpack (expected 2)

In [10]:
TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-40aa3d3f837d> in <module>()
----> 1 TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in __init__(self, corpus, id2word, dictionary, wlocal, wglobal, normalize)
     94             self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
     95         elif corpus is not None:
---> 96             self.initialize(corpus)
     97         else:
     98             # NOTE: everything is left uninitialized; presumably the model will

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/models/tfidfmodel.py in initialize(self, corpus)
    113         dfs = {}
    114         numnnz, docno = 0, -1
--> 115         for docno, bow in enumerate(corpus):
    116             if docno % 10000 == 0:
    117                 logger.info("PROGRESS: processing document #%i" % docno)

<ipython-input-10-40aa3d3f837d> in <genexpr>(.0)
----> 1 TfidfModel((d.doc2bow(tokens) for tokens in df.tokens))

/home/hobs/.virtualenvs/twip3/lib/python3.5/site-packages/gensim/corpora/dictionary.py in doc2bow(self, document, allow_update, return_missing)
    139         """
    140         if isinstance(document, string_types):
--> 141             raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
    142 
    143         # Construct (word, frequency) mapping.

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

But there's a simpler way.
We already have a vocabulary
with term and document frequencies in a matrix...


In [15]:
pd.Series(d.dfs)


Out[15]:
0          444
1         1658
2        53491
3          611
4         9048
5         2374
         ...  
87141        1
87142        1
87143        1
87144        1
87145        1
87146        1
dtype: int64

In [16]:
pd.Series(d.iteritems())


Out[16]:
0        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
1        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
2        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
3        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
4        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
5        ((3906, OBJ), (27629, MyLife), (34812, vacancy...
                               ...                        
87141    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87142    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87143    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87144    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87145    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
87146    ((3906, OBJ), (27629, MyLife), (34812, vacancy...
dtype: object

OK, now I get it

  • document is a list of strings (ordered sequence of tokens)
  • bow or [bag of words] is a list of Counter-like mappings between word IDs and their count in each document
  • TfidfModel is a transformation from a BOW into a BORF, a "bag of relative frequencies"

TFIDF = BORF = term frequencies normalized by document occurrence counts


In [21]:
pd.Series(d.doc2bow(toks) for toks in df.tokens[:6])


Out[21]:
0    [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                   [(8, 1), (9, 1), (10, 1), (11, 1)]
2    [(9, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
3    [(9, 1), (18, 1), (19, 1), (20, 1), (21, 1), (...
4    [(9, 1), (19, 1), (20, 1), (21, 1), (23, 1), (...
5                  [(9, 1), (37, 1), (38, 1), (39, 1)]
dtype: object

Did it assign 0 to the first word it found?
Sort-of...


In [22]:
d.token2id['python']


Out[22]:
0

In [23]:
d.token2id['Python']


Out[23]:
9

In [24]:
d.token2id['you']


Out[24]:
2

In [26]:
d[1]  # guesses anyone?


Out[26]:
'what'

In [ ]:


In [27]:
# The simpler way: build the model straight from the Dictionary's precomputed
# document frequencies by passing it as the dictionary= keyword.
tfidf = TfidfModel(dictionary=d)
tfidf


Out[27]:
<gensim.models.tfidfmodel.TfidfModel at 0x7f52402b0cf8>

In [30]:
# Document frequency per token, keyed by the token string and sorted
# alphabetically so related spellings appear together.
token_df_pairs = sorted((d.id2token[token_id], n_docs)
                        for token_id, n_docs in tfidf.dfs.items())
dfs = pd.Series(OrderedDict(token_df_pairs))
dfs


Out[30]:
A           7338
AA             1
AAA            2
AAAA           1
AAAAAA         1
AAAAAAND       2
            ... 
THE            1
W              2
WANT           3
WARNING        1
YOU           10
𝓩Ᏸ             1
dtype: int64

In [27]:
dfs.iloc[4000:4030]


Out[27]:
Bioinformatics    20
Biological         3
Biologist          2
Biologists        13
Biology           17
Biomechanics       1
                  ..
Birkenstocks       1
Birkin             2
Birman            56
Birmann            1
Birmingham        11
Birth             12
dtype: int64

In [28]:
tfidf.num_docs


Out[28]:
183070

In [29]:
tfidf.num_nnz


Out[29]:
2392557

In [30]:
tfidf.save(os.path.join(DATA_PATH, 'tfidf'))

In [31]:
tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))

In [32]:
tfidf2.num_nnz


Out[32]:
2392557

In [ ]: